# Read the raw NHANES data file. Every column is read as character so that
# individual fields can be sliced out of column 1 by character position.
# NOTE(review): read.table() splits on whitespace by default; all substr()
# calls on dataNHANES[, 1] assume the whole record sits in column 1 - confirm.
dataNHANES <- read.table(
  "~/Desktop/Temperature study/ICPSR_08055/DS0001/08055-0001-Data.txt",
  header = FALSE,
  colClasses = c("character")
)

# "NH_" followed by the first 19 characters of the record (this is not ID).
dataNHANES[["study_ID"]] <- paste0("NH_", substr(dataNHANES[, 1], 1, 19))

# Characters 204-209 of the record, converted to numeric.
dataNHANES[["sample_weights"]] <- as.numeric(substr(dataNHANES[, 1], 204, 209))

# Characters 346-349 of the record, scaled by 0.1.
# NOTE(review): presumably tenths of a degree Fahrenheit - confirm against the
# codebook.
dataNHANES[["temp"]] <- 0.1 * as.numeric(substr(dataNHANES[, 1], 346, 349))

# Time of day: characters 632-635 hold a four-digit clock reading (HHMM) and
# character 636 a one-digit half-of-day code.
tmp_time <- as.factor(substr(dataNHANES[, 1], 632, 635))
tmp_time_pm <- as.factor(substr(dataNHANES[, 1], 636, 636))

# Readings of "0000" and "8888" are treated as not usable.
tmp_time_valid <- !(tmp_time == "0000" | tmp_time == "8888")

# Decimal hours on a 24-hour clock, filled in only for usable readings:
#   * code 2 with an hour other than 12 gets 12 added,
#   * code 1 with hour 12 gets 12 subtracted,
# i.e. code 1 is handled as AM and code 2 as PM.
dataNHANES$time_HR[tmp_time_valid] <- local({
  clock <- tmp_time[tmp_time_valid]
  half_of_day <- tmp_time_pm[tmp_time_valid]
  hh <- substr(clock, 1, 2)
  mm <- substr(clock, 3, 4)
  as.numeric(hh) + as.numeric(mm) / 60 +
    (half_of_day == 2 & hh != "12") * 12 -
    (half_of_day == 1 & hh == "12") * 12
})



# Race: one-character code at position 110. Codes 1/2/3 are relabelled;
# any other value is kept as it appears in the file.
dataNHANES$race <- local({
  code <- substr(dataNHANES[, 1], 110, 110)
  race_label <- c("1" = "white", "2" = "black", "3" = "other")
  idx <- match(code, names(race_label))
  recognised <- !is.na(idx)
  code[recognised] <- unname(race_label[idx[recognised]])
  as.factor(code)
})

# Sex: one-character code at position 111. Codes 1/2 are relabelled;
# any other value is kept as it appears in the file.
dataNHANES$sex <- local({
  code <- substr(dataNHANES[, 1], 111, 111)
  sex_label <- c("1" = "male", "2" = "female")
  idx <- match(code, names(sex_label))
  recognised <- !is.na(idx)
  code[recognised] <- unname(sex_label[idx[recognised]])
  as.factor(code)
})


# Age: characters 151-152.
dataNHANES[["age"]] <- as.numeric(substr(dataNHANES[, 1], 151, 152))

# Year of birth from the two-digit value in characters 115-116: values above
# 70 are placed in the 1800s, everything else in the 1900s.
dataNHANES[["year_of_birth"]] <- local({
  yy <- as.numeric(substr(dataNHANES[, 1], 115, 116))
  century <- ifelse(yy > 70, 1800, 1900)
  yy + century
})

# Examination date is parsed from characters 145-150 as month, day, two-digit
# year; month and year are also stored separately.
dataNHANES[["exam_date"]] <- as.Date(
  substr(dataNHANES[, 1], 145, 150),
  format = "%m%d%y"
)
dataNHANES[["exam_year"]] <- as.numeric(substr(dataNHANES[, 1], 149, 150)) + 1900
dataNHANES[["exammonth"]] <- factor(
  substr(dataNHANES[, 1], 145, 146),
  levels = c(
    "01", "02", "03", "04", "05", "06",
    "07", "08", "09", "10", "11", "12"
  )
)

# Examination findings code (character 307) and two four-character fields
# stored as exam_ICD / exam_ICD2 (characters 308-311 and 312-315).
dataNHANES[["exam_findings"]] <- as.factor(substr(dataNHANES[, 1], 307, 307))
dataNHANES[["exam_ICD"]] <- as.factor(substr(dataNHANES[, 1], 308, 311))
dataNHANES[["exam_ICD2"]] <- as.factor(substr(dataNHANES[, 1], 312, 315))

# Region: one-character code at position 157. Codes 1-4 are relabelled; any
# other value is kept as it appears in the file (the NA recode below is
# disabled).
dataNHANES$region <- local({
  code <- substr(dataNHANES[, 1], 157, 157)
  region_label <- c(
    "1" = "NORTHEAST",
    "2" = "MIDWEST",
    "3" = "SOUTH",
    "4" = "WEST"
  )
  idx <- match(code, names(region_label))
  recognised <- !is.na(idx)
  code[recognised] <- unname(region_label[idx[recognised]])
  #dataNHANES$region [dataNHANES$region %in% c( "8", "9", "0")] <- NA
  as.factor(code)
})

# Breaking down into birth cohorts: each year of birth from 1880 up to (but
# not including) 1980 is labelled with its decade ("1880s" ... "1970s").
# Missing years and years outside that span are left as NA.
dataNHANES$birth_cohort <- local({
  yob <- dataNHANES$year_of_birth
  in_span <- !is.na(yob) & yob >= 1880 & yob < 1980
  decade_start <- floor(yob / 10) * 10
  cohort <- rep(NA_character_, length(yob))
  cohort[in_span] <- paste0(decade_start[in_span], "s")
  as.factor(cohort)
})

# Extracting data about medical examination findings.
# Each flag below is TRUE when the one-character field at the listed position
# equals "2". no_findings is TRUE only when all eight body-system flags are
# TRUE (general_findings is not part of it).
# NOTE(review): this implies code "2" means "no findings" for that system,
# despite the *_findings column names - confirm against the codebook.
dataNHANES <- local({
  field_position <- c(
    head_eyes_ears_nose_findings = 221,
    thyroid_findings = 248,
    chest_findings = 253,
    cardiovascular_findings = 260,
    abdominal_findings = 265,
    musculosceletal_findings = 285,
    neurological_findings = 290,
    skin_findings = 297,
    general_findings = 307
  )
  flags <- lapply(field_position, function(pos) {
    substr(dataNHANES[, 1], pos, pos) == "2"
  })
  dataNHANES[names(flags)] <- flags

  body_systems <- setdiff(names(flags), "general_findings")
  dataNHANES$no_findings <- Reduce(`&`, flags[body_systems])
  dataNHANES
})

#dataNHANES$obesity <-as.factor(substr(dataNHANES [,1],345,345))
#dataNHANES$obesity[dataNHANES$obesity == 1] <- "yes"
#dataNHANES$obesity[dataNHANES$obesity == 2] <- "no"

# Raw one-character code at position 247, kept as a factor.
dataNHANES[["thyroid"]] <- as.factor(substr(dataNHANES[, 1], 247, 247))

# Importing weight data from the second data file. As with the first file,
# every column is read as character and fields are sliced out of column 1.
NHANESweight <- read.table(
  "~/Desktop/Temperature study/ICPSR_08058/DS0001/08058-0001-Data.txt",
  header = FALSE,
  colClasses = c("character")
)

# The two files are combined purely by row position. Without this check a
# shorter weight file whose length divides the row count would be recycled
# silently, attaching weights and heights to the wrong records.
# NOTE(review): equal length does not prove the records are in the same
# order - consider matching on a shared identifier if one is available.
stopifnot(nrow(NHANESweight) == nrow(dataNHANES))

# Characters 263-267 and 269-273, each scaled by 0.01.
dataNHANES$weight_KG <- as.numeric(substr(NHANESweight[, 1], 263, 263 + 4)) * 0.01
dataNHANES$height_CM <- as.numeric(substr(NHANESweight[, 1], 269, 269 + 4)) * 0.01

# Body-mass index: weight in kg divided by squared height in metres.
dataNHANES$BMI <- dataNHANES$weight_KG / ((dataNHANES$height_CM * 0.01)^2)

# Temperature converted from Fahrenheit to Celsius.
dataNHANES$temp_C <- (dataNHANES$temp - 32) / 1.8
#NHANESweight$age <- as.numeric(substr( NHANESweight[,1],152,152+1))


# NOTE(review): everything in this section runs before the range filters
# applied further down the script, so out-of-range weight / height /
# temperature values are still present in these fits and means - confirm that
# this ordering is intended.

# Height-adjusted BMI: residuals of BMI regressed on height.
# na.action = na.exclude makes residuals() return one value per input row (NA
# where BMI or height is missing). With the default na.omit the residual
# vector is shorter than the data frame whenever a row is incomplete, and the
# column assignment fails.
dataNHANES$bmi_adj <- residuals(
  lm(BMI ~ height_CM, data = dataNHANES, na.action = na.exclude)
)

# Temperature model with age, height and height-adjusted BMI, fitted on the
# rows where sex is "male" and race is "white".
lmNHANES_BMI_adj <- lm(
  temp_C ~ age + height_CM + bmi_adj,
  data = dataNHANES[dataNHANES$sex == "male" & dataNHANES$race == "white", ]
)
summary(lmNHANES_BMI_adj)

# Height and weight centred on their means over all rows (missing values
# ignored when computing the mean).
dataNHANES$height_norm <- dataNHANES$height_CM -
  mean(dataNHANES$height_CM, na.rm = TRUE)
dataNHANES$weight_norm <- dataNHANES$weight_KG -
  mean(dataNHANES$weight_KG, na.rm = TRUE)

# Temperature model with age and a height-by-weight interaction on the centred
# variables, fitted on the same male / white subset.
lmNHANES_interaction <- lm(
  temp_C ~ age + height_norm * weight_norm,
  data = dataNHANES[dataNHANES$sex == "male" & dataNHANES$race == "white", ]
)
summary(lmNHANES_interaction)

# Lower and upper 0.5% quantiles of temp, computed over readings strictly
# between 50 and 150. They are only referenced by the disabled filter below.

qq <- c(0.005, 0.995)

dataNHANES_quantiles <- quantile(
  dataNHANES$temp[dataNHANES$temp > 50 & dataNHANES$temp < 150],
  probs = qq,
  na.rm = TRUE
)


#dataNHANES_subset <- !is.na(dataNHANES$temp) & dataNHANES$temp > dataNHANES_quantiles[1] & dataNHANES$temp < dataNHANES_quantiles[2] & !is.na(dataNHANES$year_of_birth) & !is.na(dataNHANES$age) & dataNHANES$age>20 & dataNHANES$age<80 & dataNHANES$BMI>10 & dataNHANES$BMI<50 & dataNHANES$weight_KG <888.88 & dataNHANES$height_CM <888.88 & !is.na(dataNHANES$BMI) &!is.na(dataNHANES$weight)


# Cleaning data: keep only rows with no missing temperature, year of birth,
# age, BMI, weight or height, and with
#   * temp_C strictly between 35 and 39,
#   * age strictly between 20 and 80,
#   * weight strictly between 30 and 200 kg (and below 888),
#   * height strictly between 120 and 220 cm (and below 888).

dataNHANES_subset <- with(
  dataNHANES,
  !is.na(temp) & !is.na(year_of_birth) & !is.na(age) &
    !is.na(BMI) & !is.na(weight_KG) & !is.na(height_CM) &
    temp_C > 35 & temp_C < 39 &
    age > 20 & age < 80 &
    weight_KG < 888 & height_CM < 888 &
    weight_KG > 30 & weight_KG < 200 &
    height_CM > 120 & height_CM < 220
)


dataNHANES <- dataNHANES[dataNHANES_subset, ]



# Export the cleaned data frame.
write.csv(dataNHANES, file = "~/Desktop/Temperature study/NHANES_processed.csv")













 
 